This notebooks explores the results from running cell type annotation
with SingleR using the NBAtlas. The NBAtlas reference was
aggregated with SingelR prior to model training.
In this notebook, we visualize inferred cell type annotations directly, compare them to normal consensus cell types, and validate cell type assignments with marker genes.
options(readr.show_col_types = FALSE)
suppressPackageStartupMessages({
library(ggplot2)
library(patchwork)
library(SingleCellExperiment)
})
theme_set(theme_bw())
# Define color ramp for shared use in the heatmaps
heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))
# Set heatmap padding option
ComplexHeatmap::ht_opt(TITLE_PADDING = grid::unit(0.6, "in"))
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
module_dir <- file.path(repository_base, "analyses", "cell-type-neuroblastoma-04")
ref_dir <- file.path(module_dir, "references")
results_dir <- file.path(module_dir, "results", "singler")
data_dir <- file.path(repository_base, "data", "current", "SCPCP000004")
# SingleR files
singler_files <- list.files(
path = results_dir,
pattern = "_singler-annotations\\.tsv",
recursive = TRUE,
full.names = TRUE
) |>
# add names as library id
purrr::set_names(
\(x) {
stringr::str_remove(basename(x), "_singler-annotations.tsv")
}
)
# merged SCE file
# OBTAINED FROM PORTAL DIRECTLY on 2025-07-09
sce_file <- file.path(
data_dir,
"SCPCP000004_merged.rds"
)
# broad consensus cell type groups
validation_url <- "https://raw.githubusercontent.com/AlexsLemonade/OpenScPCA-analysis/refs/heads/main/analyses/cell-type-consensus/references/consensus-validation-groups.tsv"
# palette file
palette_file <- file.path(
module_dir,
"palettes",
"harmonized-cell-type-palette.tsv"
)
# marker genes for validation
consensus_markers_file <- file.path(ref_dir, "consensus-marker-genes.tsv")
nbatlas_markers_file <- file.path(ref_dir, "nbatlas-marker-genes.tsv")
# Source Jaccard and heatmap utilities functions
source(file.path(module_dir, "scripts", "utils", "jaccard-utils.R"))
#' Function to update NBAtlas labels
#' This function matches NBAtlas labels to consensus labels where appropriate to facilitate plots
#'
#' @param df Data frame with columns to recode
#' @param label_column Name of column with current labels
#' @param recoded_column Name of new column to create with harmonized labels
#'
#' @returns Data frame with new column as specified containing recoded labels
harmonize_celltypes <- function(df, label_column, recoded_column) {
df |>
dplyr::mutate(
{{ recoded_column }} := dplyr::case_when(
# nbatlas ~ consensus
{{ label_column }} %in% c("Myeloid", "Fibroblast") ~ stringr::str_to_lower({{ label_column }}),
{{ label_column }} == "Endothelial" ~ "endothelial cell",
{{ label_column }} == "Plasma" ~ "plasma cell",
{{ label_column }} == "NK cell" ~ "natural killer cell",
# also make the different stromal labels more clearly distinguishable
{{ label_column }} == "stromal cell" ~ "stromal cell (consensus)",
{{ label_column }} == "Stromal other" ~ "Stromal other (NBAtlas)",
.default = {{ label_column }}
)
)
}
#' Create a faceted UMAP panel where each panel has only one cell type colored
#' Adapted from scpca-nf:
#' https://github.com/AlexsLemonade/scpca-nf/blob/4bb82aa635b572a62f2028dbec587fcfd2155e26/templates/qc_report/celltypes_qc.rmd#L134-L221
#'
#' @param umap_df Data frame with UMAP1 and UMAP2 columns
#' @param annotation_column Column containing broad cell type annotation
#' @param celltype_colors Named vector of colors to use for each broader validation group
#' @param facet_type Whether to use facet_wrap or facet_grid
#' @param annotation_type_column Additional column to use if facet_type is "grid"
#'
#' @return ggplot object containing a faceted UMAP where each cell type is a facet.
#' In each panel, the cell type of interest is colored red and all other cells are grey.
faceted_umap <- function(umap_df,
annotation_column,
celltype_colors,
facet_type = c("wrap", "grid"),
annotation_type_column = NULL) {
facet_type <- match.arg(facet_type)
# color by the annotation column but only color one cell type at a time
faceted_umap <- ggplot(
umap_df,
aes(x = UMAP1, y = UMAP2, color = {{ annotation_column }})
) +
# set points for all "other" points
geom_point(
data = dplyr::select(
umap_df, -{{ annotation_column }}
),
color = "gray90",
alpha = 0.5,
size = 0.5
) +
scale_color_manual(values = celltype_colors) +
# set points for desired cell type
geom_point(size = 0.5, alpha = 0.5) +
coord_equal() +
theme_bw() +
theme(
legend.position = "none",
axis.ticks = element_blank(),
axis.text = element_blank()
)
# add faceting as specified
if (facet_type == "wrap") {
faceted_umap <- faceted_umap +
facet_wrap(
vars({{ annotation_column }}),
ncol = 4
)
} else {
faceted_umap <- faceted_umap +
facet_grid(
rows = vars({{ annotation_column }}),
cols = vars({{ annotation_type_column }})
)
}
return(faceted_umap)
}
#' Prepare and create a marker gene expression dotplot
#' Adapted from corresponding code in this ews-nf report:
#' https://github.com/AlexsLemonade/ews-nf/blob/bd38f2bd5ae581ea7dcbd98c5ef717afb9015766/templates/summary-report.Rmd
#'
#' @param merged_sce Merged SCE object to plot with a `cell_id` column in the colData
#' @param markers_df Data frame of marker genes for validation.
#' This function expects columns `marker_gene_label` (cell types), `ensembl_gene_id`, and `gene_symbol`
#' @param singler_df Data frame of singler annotations to plot.
#' This function expects columns called `label_recoded` (cell types) and `cell_id`.
#' The `cell_id` values should match the `cell_id` colData in the merged_sce
#' @param total_cells_df Data frame of cell counts and plot order.
#' This function expects columns called `label_recoded` (cell types), `y_label` (cell types with (total cells) as factor for plot order), and `total_cells`
#' @param expressed_genes Vector of genes that are expressed in the merged_sce
#' @param bar_order Vector for the annotation bar order
#' @param min_cells Only include genes present in at least this many cells
#'
#' @returns Dotplot object
generate_dotplot <- function(
merged_sce,
markers_df,
singler_df,
total_cells_df,
expressed_genes,
bar_order,
min_cells = 0) {
all_markers <- markers_df |>
dplyr::pull(ensembl_gene_id) |>
unique()
# consider only markers that are expressed
expressed_markers <- intersect(all_markers, expressed_genes)
# get logcounts from merged_sce for expressed genes
gene_exp_df <- scuttle::makePerCellDF(
merged_sce,
features = expressed_markers,
assay.type = "logcounts",
use.coldata = "cell_id",
use.dimred = FALSE
) |>
tidyr::pivot_longer(starts_with("ENSG"), names_to = "ensembl_gene_id", values_to = "logcounts") |>
dplyr::mutate(detected = logcounts > 0)
# Join with cell type results and marker gene info
all_info_df <- singler_df |>
dplyr::left_join(gene_exp_df, by = "cell_id") |>
# account for the same gene being present in multiple cell types
dplyr::left_join(markers_df, by = "ensembl_gene_id", relationship = "many-to-many")
# define y- and x- axis orders
# y axis - singler
y_label_order <- rev(total_cells_df$y_label)
# x axis - marker genes. this uses the passed in bar_order
markers_df <- markers_df |>
dplyr::filter(ensembl_gene_id %in% expressed_markers) |>
dplyr::mutate(marker_gene_label = factor(marker_gene_label, levels = bar_order)) |>
dplyr::arrange(marker_gene_label)
marker_gene_order <- markers_df |>
dplyr::pull(gene_symbol) |>
unique()
group_stats_df <- all_info_df |>
# remove genes that aren't present in final annotations
dplyr::filter(gene_symbol %in% marker_gene_order) |>
# for each assigned cell type/marker gene combo get total detected and mean expression
dplyr::group_by(label_recoded, marker_gene_label, ensembl_gene_id) |>
dplyr::summarize(
detected_count = sum(detected),
mean_exp = mean(logcounts)
) |>
dplyr::ungroup() |>
# add total cells
dplyr::left_join(total_cells_df, by = "label_recoded") |>
# get total percent expressed
dplyr::mutate(percent_exp = (detected_count / total_cells) * 100) |>
# add in validation group for marker genes
# this includes all possible marker genes and all possible validation group assignments
dplyr::left_join(
markers_df,
by = c("ensembl_gene_id", "marker_gene_label"),
relationship = "many-to-many"
)
dotplot_df <- group_stats_df |>
dplyr::filter(
# remove lowly expressed marker genes
mean_exp > 0,
percent_exp > 10,
# only show cells that have above the minimum total number
total_cells > min_cells
) |>
# set orders of gene symbol and validation groups
dplyr::mutate(
y_label = factor(y_label, y_label_order),
gene_symbol = factor(gene_symbol, levels = marker_gene_order),
marker_gene_label = factor(marker_gene_label, levels = bar_order)
)
### Make the dotplot!
dotplot <- ggplot(dotplot_df) +
aes(
y = y_label,
x = gene_symbol,
color = mean_exp,
size = percent_exp
) +
geom_point() +
scale_color_viridis_c(option = "magma") +
facet_grid(cols = vars(marker_gene_label), scales = "free", space = "free") +
theme_classic() +
theme(
strip.background = element_rect(fill = "transparent", color = NA),
strip.placement = "outside",
strip.text.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.ticks.x = element_blank(),
text = element_text(size = 14),
panel.spacing = unit(0.5, "lines") # adjust spacing and match with annotation bar
) +
labs(
x = "Validation marker genes",
y = "SingleR label",
color = "Mean gene expression",
size = "Percent cells expressed"
)
# add annotation bar aligning marker genes with validation group
color_bar <- ggplot(dotplot_df, aes(x = gene_symbol, y = 1, fill = marker_gene_label)) +
geom_tile() +
facet_grid(cols = vars(marker_gene_label), scales = "free", space = "free") +
scale_fill_manual(values = celltype_pal, breaks = levels(dotplot_df$marker_gene_label)) +
ggmap::theme_nothing() +
theme(
strip.background = element_rect(fill = "transparent", color = NA),
strip.text.x = element_text(angle = 90, hjust = 0, vjust = 0.5, size = 12),
strip.placement = "outside",
legend.position = "none",
panel.spacing = unit(0.5, "lines"),
strip.clip = "off"
) +
labs(fill = "")
combined_plot <- color_bar / dotplot +
patchwork::plot_layout(ncol = 1, heights = c(0.1, 4))
return(combined_plot)
}
Read SCE object to get consensus cell types and UMAP coordinates:
merged_sce <- readRDS(sce_file)
# immediately remove assays we don't need for space
assay(merged_sce, "spliced") <- NULL
assay(merged_sce, "counts") <- NULL
# RUH ROH!!!
# https://github.com/AlexsLemonade/scpcaTools/issues/298
merged_sce$cell_id <- colnames(merged_sce)
Read cell type data frames:
singler_df <- singler_files |>
purrr::map(readr::read_tsv) |>
purrr::list_rbind(names_to = "library_id") |>
dplyr::select(-delta.next, -labels) |>
dplyr::mutate(
# add cell id column for unique rows & joining
cell_id = glue::glue("{library_id}-{barcodes}"),
# recode NA -> "unknown_singler"
pruned.labels = ifelse(
is.na(pruned.labels),
"unknown_singler",
pruned.labels
)
)
# validation data frames
validation_df <- readr::read_tsv(validation_url) |>
dplyr::select(consensus_annotation, validation_group_annotation)
consensus_markers_df <- readr::read_tsv(consensus_markers_file)
nbatlas_markers_df <- readr::read_tsv(nbatlas_markers_file)
# set up palette for umaps
palette_df <- readr::read_tsv(palette_file)
celltype_pal <- palette_df$hex_color
names(celltype_pal) <- palette_df$harmonized_label
Join and prepare data for use:
celltype_df <- scuttle::makePerCellDF(
merged_sce,
use.coldata = c("barcodes", "sample_id", "library_id", "consensus_celltype_annotation"),
use.dimred = c("UMAP")
) |>
# there are repeated barcodes so we need to keep cell_id around
tibble::rownames_to_column("cell_id") |>
dplyr::rename(
UMAP1 = UMAP.1,
UMAP2 = UMAP.2,
consensus_annotation = consensus_celltype_annotation
) |>
dplyr::left_join(validation_df, by = "consensus_annotation") |>
# recode NAs to "unknown_consensus" and remove full consensus labels
dplyr::mutate(validation_group_annotation = ifelse(
is.na(validation_group_annotation),
"unknown_consensus",
validation_group_annotation
)) |>
dplyr::select(-consensus_annotation) |>
dplyr::left_join(singler_df, by = c("cell_id", "barcodes", "library_id"))
# Recode NBAtlas cell types where possible so they match with colors
celltype_recoded_df <- celltype_df |>
# rename to make annotation sources more clear
dplyr::rename(
"consensus_validation" = validation_group_annotation,
"singler" = pruned.labels
) |>
# pivot longer for wrangling
tidyr::pivot_longer(
c(consensus_validation, singler),
names_to = "annotation_type",
values_to = "label"
) |>
harmonize_celltypes(label, label_recoded) |>
dplyr::select(-label)
This section compares SingleR annotations to consensus cell type annotations using a heatmap. The heatmap is colored by Jaccard similarity index.
# Pivot wider for heatmap functions
celltype_recoded_wide_df <- celltype_recoded_df |>
tidyr::pivot_wider(
names_from = annotation_type,
values_from = label_recoded
)
heatmap_col_fun <- circlize::colorRamp2(c(0, 1), colors = c("white", "darkslateblue"))
make_jaccard_heatmap(
celltype_recoded_wide_df,
"consensus_validation",
"singler",
"Consensus validation label",
"SingleR NBAtlas label"
)
This section visualizes and compares SingleR annotations to consensus cell type annotations using UMAPs. Note that the displayed UMAP is from a merged object that has not been batch-corrected.
Cell type labels have been harmonized between sources wherever
possible. Note that each set of labels has its own “stromal” category
which the labels distinguish. In addition, cells labeled
unknown_consensus are those with no assigned consensus
label, and cells labeled unknown_singler are those where
SingleR could not confidently assign a label.
First, we display the consensus and SingleR annotations for all cells.
ggplot(celltype_recoded_df) +
aes(x = UMAP1, y = UMAP2, color = label_recoded) +
geom_point(size = 0.25, alpha = 0.5) +
scale_color_manual(
values = celltype_pal,
name = "Harmonized cell types"
) +
facet_wrap(vars(annotation_type)) +
coord_equal() +
theme(
legend.position = "bottom",
axis.ticks = element_blank(),
axis.text = element_blank()
) +
guides(color = guide_legend(override.aes = list(size = 2, alpha = 1)))
Below we display a faceted UMAP to highlight the SingleR annotations. Light gray cells in each panel represent all other cell types.
celltype_recoded_df |>
dplyr::filter(annotation_type == "singler") |>
faceted_umap(
annotation_column = label_recoded,
celltype_colors = celltype_pal
)
Below, we display faceted UMAPs highlighting a single cell type but considering only cell types that have direct correspondence between SingleR and consensus cell types. This allows us to see if the normal cells that SingleR is labeling correspond well to the normal cells identified by consensus labels. Each row displays a cell type where the left column shows the consensus version and the right column shows the SingleR version.
In addition, we include a category below putative-tumor
to directly compare the unknown consensus labels with the
Neuroendocrine cells labeled by SingleR. While these
categories are not necessarily directly comparable, they are each most
likely to contain tumor cells.
direct_celltype_matches <- c(
"B cell",
"T cell",
"myeloid",
"fibroblast",
"endothelial cell",
"plasma cell",
"natural killer cell",
# we'll use this label to be able to directly compare unknown_consensus to Neuroendocrine
"putative-tumor"
)
celltype_facet_df <- celltype_recoded_df |>
# recode so Neuroendocrine matches with unknown_consensus in the plot
dplyr::mutate(
label_recoded = ifelse(
label_recoded %in% c("Neuroendocrine", "unknown_consensus"),
"putative-tumor",
label_recoded
)
) |>
dplyr::filter(label_recoded %in% direct_celltype_matches)
faceted_umap(
celltype_facet_df,
annotation_column = label_recoded,
celltype_colors = celltype_pal,
facet_type = "grid",
annotation_type_column = annotation_type
)
In this section we’ll validate annotations using two sets of marker genes:
# Prepare the expressed_genes vector
# we only care about if that gene is expressed otherwise we won't waste memory and include it
gene_sums <- rowData(merged_sce) |>
as.data.frame() |>
dplyr::select(contains("detected")) |>
as.matrix() |>
rowSums()
expressed_genes <- names(gene_sums)[gene_sums > 0]
# Prepare data frame with singler labels to plot
singler_recoded_df <- celltype_recoded_df |>
dplyr::filter(
annotation_type == "singler",
# we don't consider the NAs here
label_recoded != "unknown_singler"
) |>
dplyr::select(cell_id, label_recoded)
# get total number of cells per final annotation group and set up y_label
total_cells_df <- singler_recoded_df |>
dplyr::count(label_recoded, name = "total_cells") |>
dplyr::arrange(desc(total_cells)) |>
dplyr::mutate(y_label = glue::glue("{label_recoded} ({total_cells})"))
singler_order <- total_cells_df$y_label
total_cells_df$y_label <- factor(total_cells_df$y_label, levels = singler_order)
We’ll show the top-seven highest logFC marker genes per cell type for validation.
# number of marker genes to consider per validation group
n_marker_genes <- 7
nbatlas_markers_df <- nbatlas_markers_df |>
# keep only the top `n_marker_genes`
dplyr::filter(direction == "up") |>
dplyr::group_by(NBAtlas_label) |>
dplyr::mutate(rank_lfc = rank(-avg_log2FC)) |>
dplyr::ungroup() |>
dplyr::filter(rank_lfc <= n_marker_genes) |>
harmonize_celltypes(NBAtlas_label, marker_gene_label) |>
dplyr::select(marker_gene_label, gene_symbol, ensembl_gene_id)
nbatlas_bar_order <- total_cells_df$label_recoded
generate_dotplot(
merged_sce,
nbatlas_markers_df,
singler_recoded_df,
total_cells_df,
expressed_genes,
nbatlas_bar_order
)
`summarise()` has grouped output by 'label_recoded', 'marker_gene_label'. You
can override using the `.groups` argument.
# prepare for dotplot
consensus_markers_df <- consensus_markers_df |>
# we'll use this to help determine the bar order
harmonize_celltypes(NBAtlas_label, NBAtlas_label_recoded) |>
dplyr::select(
marker_gene_label = validation_group_annotation,
ensembl_gene_id,
gene_symbol,
NBAtlas_label_recoded
)
# get the bar order
consensus_bar_order <- total_cells_df |>
dplyr::select(label_recoded, y_label) |>
# inner!!!!
dplyr::inner_join(consensus_markers_df, by = c("label_recoded" = "NBAtlas_label_recoded")) |>
dplyr::arrange(y_label) |>
dplyr::pull(marker_gene_label) |>
unique()
generate_dotplot(
merged_sce,
consensus_markers_df,
singler_recoded_df,
total_cells_df,
expressed_genes,
consensus_bar_order
)
`summarise()` has grouped output by 'label_recoded', 'marker_gene_label'. You
can override using the `.groups` argument.
# record the versions of the packages used in this analysis and other environment information
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS 15.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/New_York
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
[3] Biobase_2.64.0 GenomicRanges_1.56.2
[5] GenomeInfoDb_1.40.1 IRanges_2.38.1
[7] S4Vectors_0.42.1 BiocGenerics_0.50.0
[9] MatrixGenerics_1.16.0 matrixStats_1.5.0
[11] patchwork_1.3.0 ggplot2_3.5.2
loaded via a namespace (and not attached):
[1] tidyselect_1.2.1 viridisLite_0.4.2
[3] dplyr_1.1.4 farver_2.1.2
[5] bitops_1.0-9 fastmap_1.2.0
[7] digest_0.6.37 lifecycle_1.0.4
[9] cluster_2.1.8 Cairo_1.6-2
[11] magrittr_2.0.3 compiler_4.4.0
[13] rlang_1.1.6 sass_0.4.10
[15] tools_4.4.0 yaml_2.3.10
[17] knitr_1.50 labeling_0.4.3
[19] S4Arrays_1.4.1 bit_4.6.0
[21] curl_6.3.0 DelayedArray_0.30.1
[23] plyr_1.8.9 RColorBrewer_1.1-3
[25] abind_1.4-8 BiocParallel_1.38.0
[27] withr_3.0.2 purrr_1.0.4
[29] grid_4.4.0 beachmat_2.20.0
[31] colorspace_2.1-1 scales_1.4.0
[33] iterators_1.0.14 cli_3.6.5
[35] rmarkdown_2.29 crayon_1.5.3
[37] generics_0.1.4 httr_1.4.7
[39] tzdb_0.5.0 rjson_0.2.23
[41] DelayedMatrixStats_1.26.0 scuttle_1.14.0
[43] cachem_1.1.0 stringr_1.5.1
[45] zlibbioc_1.50.0 parallel_4.4.0
[47] BiocManager_1.30.26 XVector_0.44.0
[49] vctrs_0.6.5 Matrix_1.7-1
[51] jsonlite_2.0.0 GetoptLong_1.0.5
[53] hms_1.1.3 bit64_4.6.0-1
[55] clue_0.3-66 jpeg_0.1-11
[57] foreach_1.5.2 tidyr_1.3.1
[59] jquerylib_0.1.4 glue_1.8.0
[61] codetools_0.2-20 stringi_1.8.7
[63] shape_1.4.6.1 gtable_0.3.6
[65] UCSC.utils_1.0.0 ComplexHeatmap_2.20.0
[67] tibble_3.3.0 pillar_1.10.2
[69] htmltools_0.5.8.1 GenomeInfoDbData_1.2.12
[71] circlize_0.4.16 R6_2.6.1
[73] sparseMatrixStats_1.16.0 doParallel_1.0.17
[75] rprojroot_2.0.4 vroom_1.6.5
[77] evaluate_1.0.3 lattice_0.22-6
[79] readr_2.1.5 png_0.1-8
[81] renv_1.1.3 bslib_0.9.0
[83] ggmap_4.0.1 Rcpp_1.0.14
[85] SparseArray_1.4.8 xfun_0.52
[87] pkgconfig_2.0.3 GlobalOptions_0.1.2